Introduction

Virtual idols originated in Japan, with roots in anime and Japanese idol culture dating back to the 1980s. In the last decade, vtubers (virtual YouTubers) have emerged thanks to the popularity of the Internet and devices such as smartphones, PCs and 3D motion sensors.

A vtuber on the Bilibili platform (also called a vup) is a live-streaming entertainer who uses a virtual avatar generated with computer graphics and animated by real-time motion-capture software. Danmakus data show that hundreds of vups earn more than 0.1 million CNY every month, and some even earn 1 million CNY. Bilibili's virtual-idol market is worth on the order of a billion CNY per year.

One question of interest is what factors influence income in vtuber live streams, and how. We analyze live-stream data from different livers and aim to answer this question.

Data analysis

data and setup

Live data come from wandleshen@github, who collected the last 50 live streams of each of 50 livers via the Danmakus API as of 2023-02-10 16:42:54 — 2477 live streams in total. (Not every liver has 50 live streams.) Vtuber information was collected manually from homepages: liver, id and affiliation come from wandleshen@github, while sex, country and ip were collected by myself.

# Load the live-stream records and the manually collected vtuber metadata.
# (Fix: use `<-` consistently for assignment instead of mixing `=`.)
df <- read.csv("data.csv")
vtb <- read.csv("vtb2.csv")
str(vtb)
## 'data.frame':    50 obs. of  7 variables:
##  $ X          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ liver      : chr  "七海Nana7mi" "中单光一" "内德维德" "弥希Miki" ...
##  $ affiliation: chr  "vr" "vr" "independent" "vr" ...
##  $ id         : int  434334701 434401868 90873 477317922 477342747 477306079 480680646 56748733 558070433 666726799 ...
##  $ sex        : chr  "女" "男" "男" "女" ...
##  $ country    : chr  "中国" "中国" "中国" "中国" ...
##  $ ip         : chr  "上海" "上海" "新疆" "江苏" ...
# Drop the row-index column X written by the CSV export.
# (Fix: use `<-` consistently for assignment instead of `=`.)
vtb$X <- NULL
# Encode the categorical vtuber attributes as factors for later modelling.
vtb$sex <- as.factor(vtb$sex)
vtb$ip <- as.factor(vtb$ip)
vtb$country <- as.factor(vtb$country)
library(GGally)
library(pheatmap)
library(reshape2)
library(dplyr)
library(ggplot2)
library(lubridate)
library(extrafont)
library(marginaleffects)
library(MatchIt)
library(grf)
library(randomForest)
#font_import()
#fonts()
str(df)
## 'data.frame':    2477 obs. of  17 variables:
##  $ liver               : chr  "-阿蕊娅Aria-" "-阿蕊娅Aria-" "-阿蕊娅Aria-" "-阿蕊娅Aria-" ...
##  $ area                : chr  "虚拟Singer" "虚拟Singer" "虚拟主播" "虚拟主播" ...
##  $ danmakusCount       : int  2627 3762 2167 4039 1393 1418 3383 3101 3699 5789 ...
##  $ startDate           : num  1.68e+12 1.68e+12 1.68e+12 1.68e+12 1.68e+12 ...
##  $ stopDate            : num  1.68e+12 1.68e+12 1.68e+12 1.68e+12 1.68e+12 ...
##  $ timeDuration        : int  11054514 11844848 8461954 14922527 46881886 7544343 11505659 10361140 11247584 15638131 ...
##  $ title               : chr  "下面我来简单唱两句" "赫赫,我来简单唱两句" "早上好!唱一会儿~" "歌杂~" ...
##  $ totalIncome         : num  1754 11057 3852 8225 666 ...
##  $ watchCount          : int  4078 4928 3382 4455 5452 2698 4348 4544 4997 6103 ...
##  $ interactionCount    : int  468 649 451 575 355 317 530 501 550 772 ...
##  $ superchatCount      : int  16 12 7 19 3 10 12 14 24 19 ...
##  $ superchatIncome     : int  680 597 230 3276 90 428 430 1480 930 890 ...
##  $ superchatTimestamps : chr  "[1675954450657, 1675955951334, 1675957817542, 1675957914753, 1675958621817, 1675958692099, 1675958916597, 16759"| __truncated__ "[1675864937946, 1675865846334, 1675866462953, 1675866781825, 1675867310789, 1675869256286, 1675871757716, 16758"| __truncated__ "[1675728302746, 1675729527578, 1675730123898, 1675734044137, 1675734117096, 1675734908071, 1675735637727]" "[1675605761897, 1675606540268, 1675606864640, 1675607128304, 1675607144758, 1675607910721, 1675608046866, 16756"| __truncated__ ...
##  $ membershipCount     : int  0 11 9 6 3 2 13 8 22 4 ...
##  $ membershipIncome    : int  0 7358 1242 2688 414 276 5114 1164 26134 16412 ...
##  $ membershipTimestamps: chr  "[]" "[1675868606293, 1675868631905, 1675869343361, 1675869438134, 1675869457256, 1675869889257, 1675870029686, 16758"| __truncated__ "[1675727902618, 1675728119437, 1675728413698, 1675728525204, 1675733327041, 1675733557159, 1675733721169, 16757"| __truncated__ "[1675608154614, 1675609365272, 1675609368458, 1675613019747, 1675615266252, 1675615911612]" ...
##  $ followers           : int  34583 34583 34583 34583 34583 34583 34583 34583 34583 34583 ...
summary(df)
##     liver               area           danmakusCount      startDate        
##  Length:2477        Length:2477        Min.   :     0   Min.   :1.652e+12  
##  Class :character   Class :character   1st Qu.:  2251   1st Qu.:1.672e+12  
##  Mode  :character   Mode  :character   Median :  5227   Median :1.674e+12  
##                                        Mean   : 11817   Mean   :1.673e+12  
##                                        3rd Qu.: 11989   3rd Qu.:1.675e+12  
##                                        Max.   :210878   Max.   :1.676e+12  
##     stopDate          timeDuration          title            totalIncome      
##  Min.   :1.652e+12   Min.   :-26494026   Length:2477        Min.   :     0.0  
##  1st Qu.:1.672e+12   1st Qu.:  7642526   Class :character   1st Qu.:   542.9  
##  Median :1.674e+12   Median : 10096143   Mode  :character   Median :  1550.4  
##  Mean   :1.673e+12   Mean   : 11152512                      Mean   :  5491.7  
##  3rd Qu.:1.675e+12   3rd Qu.: 14005235                      3rd Qu.:  4392.5  
##  Max.   :1.676e+12   Max.   : 70100660                      Max.   :450768.7  
##    watchCount     interactionCount superchatCount    superchatIncome
##  Min.   :     0   Min.   :    0    Min.   :   0.00   Min.   :    0  
##  1st Qu.:  5000   1st Qu.:  499    1st Qu.:   2.00   1st Qu.:   90  
##  Median : 10595   Median : 1108    Median :   9.00   Median :  352  
##  Mean   : 21753   Mean   : 2131    Mean   :  25.05   Mean   : 1338  
##  3rd Qu.: 27410   3rd Qu.: 2644    3rd Qu.:  27.00   3rd Qu.: 1170  
##  Max.   :265166   Max.   :25175    Max.   :1724.00   Max.   :96989  
##  superchatTimestamps membershipCount   membershipIncome membershipTimestamps
##  Length:2477         Min.   :   0.00   Min.   :     0   Length:2477         
##  Class :character    1st Qu.:   1.00   1st Qu.:   138   Class :character    
##  Mode  :character    Median :   3.00   Median :   474   Mode  :character    
##                      Mean   :  12.75   Mean   :  3020                       
##                      3rd Qu.:   8.00   3rd Qu.:  1716                       
##                      Max.   :2173.00   Max.   :386712                       
##    followers      
##  Min.   :  34583  
##  1st Qu.: 115577  
##  Median : 263239  
##  Mean   : 354179  
##  3rd Qu.: 486642  
##  Max.   :1339125

Exploratory Data Analysis

# Distribution of stream start times, in days since the earliest stream
# (startDate is in milliseconds since the epoch; 8.64e07 ms = 1 day).
ggplot(data = df, aes(x = (startDate-min(startDate))/8.64e07)) + geom_histogram(binwidth=1)+
  xlab("Day")

# Distribution of stream lengths in hours
# (timeDuration is in milliseconds; 3.6e06 ms = 1 hour).
ggplot(data = df, aes(x = timeDuration/3.6e06)) + geom_histogram(binwidth=1)+
  xlab("Hour")

Transform the timestamps to date types and remove streams whose timeDuration is not positive.

# Convert the millisecond epoch timestamps to POSIXct in the Shanghai timezone.
df_tf=mutate(df, posix_startDate = with_tz(as.POSIXct(startDate / 1000, origin = "1970-01-01", tz = "UTC"), "Asia/Shanghai"),
             posix_stopDate = with_tz(as.POSIXct(stopDate / 1000, origin = "1970-01-01", tz = "UTC"), "Asia/Shanghai"))

# Add the weekday of the start time (ordered Monday..Sunday), convert
# timeDuration from ms to hours, drop streams with non-positive duration,
# and join the vtuber metadata (merge() joins on the shared column name,
# which for these two frames is "liver" -- TODO confirm).
df_tf=df_tf%>%
  mutate(weekday=factor(weekdays(posix_startDate), levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")),timeDuration=timeDuration/3.6e06)%>%
  filter(timeDuration>0)%>%
  merge(vtb)
# Time-series panels of the numeric metrics. NOTE(review): the positional
# indices select danmakusCount, timeDuration, totalIncome, watchCount,
# interactionCount, superchatCount, superchatIncome, membershipCount,
# membershipIncome and posix_startDate -- fragile if column order changes.
ggplot(melt(df_tf[,c(3,6,8,9,10,11,12,14,15,18)],id="posix_startDate"),
            aes(x = posix_startDate,y = value) )+
  geom_line()+
  facet_wrap(~variable,scales = "free_y",ncol=3)

Plot the time series and find three peaks at about 2022-10-30, Christmas Eve and Spring Festival.

# Show the six streams with the largest membership income.
df_tf %>%
  arrange(desc(membershipIncome)) %>%
  select(liver, title, membershipIncome, totalIncome, posix_startDate) %>%
  head()
##              liver                              title membershipIncome
## 1         美月もも               【美月もも】出道首播           386712
## 2 阿梓从小就很可爱                       晚好晚好晚好           362044
## 3  雫るる_Official             LULU3D公布回!让你心动           288424
## 4          阿萨Aza               除夕快乐啊啊啊啊啊!           185988
## 5       雾深Girimi 【新衣回】超可爱的熊猫不是猫吗;;           154416
## 6          阿萨Aza                   又是一年平安夜!           134574
##   totalIncome     posix_startDate
## 1    450768.7 2022-10-30 18:56:52
## 2    403178.9 2022-12-20 19:01:26
## 3    335778.0 2023-01-24 19:34:24
## 4    299294.0 2023-01-21 21:02:12
## 5    172216.0 2023-01-22 19:57:23
## 6    200934.5 2022-12-24 19:58:19
par(mfrow=c(1,2))

# Median of each engagement/income metric per weekday.
# Fixes: the first chunk grouped by a misspelled name ("weekadays"); also,
# weekday is already a factor with Monday..Sunday levels (set when df_tf
# was built), so re-levelling it inside group_by() was redundant.
df_tf%>%
  group_by(weekdays=weekday)%>%
  summarise(watchCount = median(watchCount),
            interactionCount = median(interactionCount),
            totalIncome = median(totalIncome),
            danmakusCount = median(danmakusCount),
            superchatIncome=median(superchatIncome),
            membershipIncome=median(membershipIncome)
            )%>%
  plot(main="median per weekday")

# Sum of each engagement/income metric per weekday.
df_tf%>%
  group_by(weekdays=weekday)%>%
  summarise(watchCount = sum(watchCount),
            interactionCount = sum(interactionCount),
            totalIncome = sum(totalIncome),
            danmakusCount = sum(danmakusCount),
            superchatIncome=sum(superchatIncome),
            membershipIncome=sum(membershipIncome)
            )%>%
  plot(main="sum per weekday")

Saturday is the best day and Sunday is the second. I guess many viewers work until Saturday evening.

# Pairwise Pearson correlations of the numeric metrics, drawn as a heatmap.
# pheatmap() also reorders rows/columns by hierarchical clustering, which is
# what the cluster interpretation below refers to.
df_tf%>%
  transmute(danmakusCount,interactionCount,timeDuration,totalIncome,superchatCount,superchatIncome,membershipCount,membershipIncome,followers)%>%
  cor()%>%
  pheatmap()

Every pair of variables is positively correlated. Hierarchical clustering (hclust) shows three clusters: followers; timeDuration, totalIncome, membershipCount, membershipIncome; and superchatCount, superchatIncome, danmakusCount, interactionCount.

# Per-liver averages: log mean income vs log mean watch count, point size by
# follower count, colour by affiliation, with a single pooled lm fit.
# Fix: unset both colour AND size inside geom_smooth()'s aes() so the fitted
# line is not affected by the point-only aesthetics; this silences the
# "aesthetics were dropped during statistical transformation" warning.
ggplot(df_tf%>%group_by(liver)%>%summarize(mean.totalIncome=mean(totalIncome),mean.watchCount=mean(watchCount),followers=mean(followers), affiliation=affiliation[1], mean.danmakusCount=mean(danmakusCount), mean.superchatCount=mean(superchatCount),mean.membershipCount=mean(membershipCount), mean.superchatIncome=mean(superchatIncome), mean.membershipIncome=mean(membershipIncome) ), aes(x=log(mean.watchCount), y=log(mean.totalIncome), size = followers, color = affiliation)) +
  geom_point(alpha=0.6)+
  geom_text(aes(label = liver),hjust=0, vjust=0,family="Songti SC")+
  theme_bw(base_family = "Songti SC")+
  scale_colour_brewer(palette="Set1")+
  geom_smooth(method=lm,aes(color = NULL, size = NULL))+
  guides(size = guide_legend(override.aes = list(linetype = 0)),
         color = guide_legend(override.aes = list(linetype = 0)))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The following aesthetics were dropped during statistical transformation: size
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Linear regression performs well

# Per-weekday averages: log mean income vs log mean watch count, sized by
# mean membership count and coloured by mean superchat count.
# Fixes: scale_fill_viridis_c() had no effect because no fill aesthetic is
# mapped -- the continuous variable is mapped to colour, so use
# scale_color_viridis_c(); also unset colour/size in geom_smooth()'s aes()
# to avoid the dropped-aesthetic warning on the fitted line.
ggplot(df_tf%>%group_by(weekday)%>%summarize(mean.totalIncome=mean(totalIncome),mean.watchCount=mean(watchCount),followers=mean(followers),  mean.danmakusCount=mean(danmakusCount), mean.superchatCount=mean(superchatCount),mean.membershipCount=mean(membershipCount), mean.superchatIncome=mean(superchatIncome), mean.membershipIncome=mean(membershipIncome) ), aes(x=log(mean.watchCount), y=log(mean.totalIncome), size = mean.membershipCount, color = mean.superchatCount)) +
geom_point(alpha=0.6)+
  geom_smooth(method=lm,aes(color = NULL, size = NULL))+
  geom_text(aes(label = weekday),hjust=0, vjust=0,family="Songti SC")+
  scale_color_viridis_c()+
  guides(size = guide_legend(override.aes = list(linetype = 0)),
         color = guide_legend(override.aes = list(linetype = 0)))
## Warning: The following aesthetics were dropped during statistical transformation: size
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Timestamp process

# Activity by hour of day: superchat counts, membership counts, stream
# starts, stream stops and livers online per time bin, over a 0-24h axis.
# NOTE(review): time_vec, superchat_num, membership_num, start_num, stop_num
# and online_num are not defined in any visible chunk -- presumably computed
# from the *Timestamps columns in a hidden chunk; verify before re-running.
ggplot(melt(data.frame(time=time_vec,
                       superchat=superchat_num,membership=membership_num,
                       start=start_num,stop=stop_num,liver=online_num),id="time"),
            aes(x = time,y = value) )+
  geom_path()+
  facet_wrap(~variable,scales = "free_y",ncol=2)+
  scale_x_continuous(breaks = seq(0, 24, by = 2))

There are peaks on the hour and a particularly high peak at 0:00. In addition, membershipCount stays high during 18:00–22:00 and superchatCount stays high during 20:00–24:00.

Most livers start on the hour (e.g. at 20:00), which produces the peaks of membershipCount and superchatCount. The peak at 0:00 is possibly due to the symbolic meaning of a new day beginning and Bilibili's automatic membership renewal.

Future Plan

PCA and kmeans

# Logical mask of the numeric columns, then manually drop positions 4, 5 and
# 22 -- startDate, stopDate and id per the printout below: raw epoch times
# and the numeric room id are not meaningful features for PCA/clustering.
col_num<-unlist(lapply(df_tf, is.numeric))
col_num[c(4,5,22)]=FALSE
col_num
##                liver                 area        danmakusCount 
##                FALSE                FALSE                 TRUE 
##            startDate             stopDate         timeDuration 
##                FALSE                FALSE                 TRUE 
##                title          totalIncome           watchCount 
##                FALSE                 TRUE                 TRUE 
##     interactionCount       superchatCount      superchatIncome 
##                 TRUE                 TRUE                 TRUE 
##  superchatTimestamps      membershipCount     membershipIncome 
##                FALSE                 TRUE                 TRUE 
## membershipTimestamps            followers      posix_startDate 
##                FALSE                 TRUE                FALSE 
##       posix_stopDate              weekday          affiliation 
##                FALSE                FALSE                FALSE 
##                   id                  sex              country 
##                FALSE                FALSE                FALSE 
##                   ip 
##                FALSE
# PCA on log(x + 1)-transformed metrics; the +1 guards against log(0),
# since counts and incomes can be zero (see the summaries above).
pca<-prcomp(log(df_tf[,col_num]+1))
pca
## Standard deviations (1, .., p=10):
##  [1] 4.3859910 1.9775653 1.3326655 0.7648005 0.6608613 0.5496997 0.4207956
##  [8] 0.3555255 0.3222402 0.1769085
## 
## Rotation (n x k) = (10 x 10):
##                         PC1         PC2         PC3         PC4          PC5
## danmakusCount    0.24060079  0.23069642 -0.28830970 -0.35968744  0.415100484
## timeDuration     0.03480811  0.02258354 -0.04564358 -0.15559743  0.139998427
## totalIncome      0.38407619 -0.01420061  0.08108918 -0.60589014 -0.671482260
## watchCount       0.20663207  0.23424574 -0.52543785  0.01450248  0.003667899
## interactionCount 0.21223524  0.21212031 -0.38890992 -0.05969538  0.138928938
## superchatCount   0.28047934  0.26573138  0.14920323  0.10043430  0.204527613
## superchatIncome  0.47751485  0.47101377  0.53594764  0.28632304  0.008519136
## membershipCount  0.22350211 -0.18175075 -0.01561185 -0.05525921 -0.013342268
## membershipIncome 0.58442558 -0.70930140 -0.04772469  0.22633560  0.185782466
## followers        0.08630751  0.13263650 -0.41092580  0.57374938 -0.511166938
##                           PC6         PC7         PC8          PC9         PC10
## danmakusCount    -0.040905103  0.30152039  0.42739172  0.001956260  0.478360942
## timeDuration      0.112771669 -0.22191173  0.09273304  0.910086096 -0.231444465
## totalIncome       0.071029872  0.12883885 -0.07403416  0.014657683 -0.019403354
## watchCount        0.228685323 -0.52529817 -0.47734612 -0.081947390  0.251621895
## interactionCount -0.007682955  0.08147744  0.15444338 -0.262273993 -0.797814175
## superchatCount   -0.369691934  0.45174185 -0.63201549  0.187657301 -0.033709345
## superchatIncome   0.194132479 -0.26531312  0.26085571 -0.067037130  0.014632597
## membershipCount  -0.824153650 -0.45878201  0.14956725 -0.031844804  0.021060015
## membershipIncome  0.238920895  0.09800830 -0.02366265  0.001740982  0.008118608
## followers        -0.132423933  0.25713060  0.24723311  0.235149398  0.124687105
# Cumulative proportion of variance explained by the leading components.
# Fix: variance proportions come from the squared standard deviations
# (pca$sdev^2, the eigenvalues); the original cumulated raw sdev values,
# which overstates the contribution of the later components.
plot(cumsum(pca$sdev^2)/sum(pca$sdev^2))

biplot(pca)        # PC1 vs PC2

biplot(pca,c(1,3)) # PC1 vs PC3

biplot(pca,2:3)    # PC2 vs PC3

proc0 <- function(dt) {
  # Floor every entry of the data frame at 0.5 so a subsequent log() is
  # finite for zero counts/incomes.
  # Fix: vectorized column-wise pmax() replaces the original element-by-
  # element double loop (identical result, no O(rows*cols) scalar writes).
  dt[] <- lapply(dt, pmax, 0.5)
  return(dt)
}
# K-means with 3 clusters on the log of the 0.5-floored numeric metrics.
# NOTE(review): no set.seed() or nstart > 1 in this chunk, so the partition
# and label numbering may differ across runs -- confirm if reproducing.
clust<-kmeans(as.matrix(log(proc0(df_tf[,col_num])) ),3)
clust
## K-means clustering with 3 clusters of sizes 904, 1098, 456
## 
## Cluster means:
##   danmakusCount timeDuration totalIncome watchCount interactionCount
## 1      7.961558    0.9322399    6.704376   8.733507         6.470306
## 2      9.593734    1.1970757    8.612388  10.182666         7.929021
## 3      7.494122    0.7317177    4.886031   8.378117         6.059340
##   superchatCount superchatIncome membershipCount membershipIncome followers
## 1      1.0970877        4.225044       0.7665128        5.8091397  12.05215
## 2      3.4345804        7.249137       2.2128304        7.5777277  12.78910
## 3      0.6306728        3.046705      -0.6931472       -0.6931472  12.04678
## 
## Clustering vector:
##    [1] 3 2 1 2 1 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 1 3 2 1 2 1 3 1 1 1 3 2 1 1 2 3
##   [38] 1 2 2 3 1 3 1 1 2 1 3 1 2 3 3 1 3 3 3 1 1 3 1 1 3 1 2 3 3 1 3 3 1 3 2 1 3
##   [75] 1 3 2 1 3 1 1 3 1 1 1 1 1 3 3 1 3 3 1 3 1 1 3 3 1 1 2 2 2 2 2 2 2 2 2 2 2
##  [112] 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [149] 2 2 1 2 2 1 2 1 2 2 3 2 2 3 2 2 1 2 2 1 2 2 2 2 2 1 2 1 2 1 3 2 2 1 2 1 3
##  [186] 2 2 3 2 2 2 2 2 1 2 1 3 2 2 2 2 1 3 3 3 2 1 2 1 1 2 2 2 2 1 2 2 1 1 2 3 2
##  [223] 2 3 2 2 3 2 1 1 1 1 1 2 1 1 1 3 1 3 2 1 1 1 1 1 2 1 1 3 1 3 3 3 1 3 1 1 1
##  [260] 1 1 2 3 1 1 1 1 3 1 1 1 3 1 3 1 3 3 3 3 1 1 1 1 3 1 3 3 1 1 3 1 3 1 1 1 3
##  [297] 1 1 1 1 3 1 1 3 2 1 3 3 1 3 1 2 1 3 1 3 1 1 1 2 2 3 1 1 2 1 2 3 2 1 2 2 1
##  [334] 1 1 2 2 1 2 2 3 1 3 2 2 2 3 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
##  [371] 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 3 3 3 1 3 1
##  [408] 1 1 2 1 3 1 1 1 1 1 1 2 1 1 3 3 3 2 3 1 3 1 1 1 3 1 1 3 1 3 3 1 1 1 1 2 2
##  [445] 3 1 3 1 3 1 1 1 3 3 1 1 1 1 1 1 1 1 2 2 2 1 1 3 1 1 1 1 1 1 3 3 1 1 1 1 3
##  [482] 1 1 2 1 1 1 2 3 1 1 1 1 1 3 3 2 1 3 1 3 3 3 3 3 1 3 1 3 1 1 3 2 2 3 3 1 3
##  [519] 3 3 2 1 3 1 1 1 1 1 3 1 3 1 3 1 1 3 2 3 3 1 1 3 3 1 3 2 1 2 1 1 3 3 1 3 1
##  [556] 3 2 2 2 3 3 1 3 2 3 3 1 2 1 2 2 2 1 2 1 2 1 3 2 3 1 3 2 2 1 1 2 1 2 2 2 2
##  [593] 1 3 2 3 3 1 1 1 1 2 1 1 3 1 2 1 1 1 2 1 1 3 1 2 2 3 2 1 1 1 1 1 2 1 1 1 3
##  [630] 3 2 3 1 3 1 1 1 1 3 2 1 1 2 1 1 1 1 1 3 1 3 1 3 1 3 1 1 1 1 1 1 1 1 1 1 1
##  [667] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 3 3 1 1 1 3 2 1 2 2 2 2 2 3
##  [704] 1 1 2 2 1 3 2 2 2 2 2 2 1 1 1 3 1 3 3 2 3 3 3 2 2 1 1 1 2 2 2 1 1 1 1 2 1
##  [741] 2 1 1 1 1 3 1 3 1 3 1 2 2 3 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 3 2 2 2 1 2
##  [778] 2 3 1 1 2 2 1 2 1 1 1 1 1 3 1 1 1 3 1 1 1 1 1 1 1 1 3 1 2 1 2 1 1 1 3 1 1
##  [815] 1 3 2 3 3 1 3 3 3 3 3 3 3 3 1 2 2 1 2 1 1 1 3 1 2 2 2 2 2 2 1 3 2 1 1 2 2
##  [852] 2 2 3 2 2 2 2 2 1 1 2 2 2 1 2 2 2 2 3 2 1 2 2 2 2 2 2 3 1 2 2 2 2 2 1 2 2
##  [889] 2 2 2 1 2 2 2 1 2 3 2 2 2 2 2 1 3 3 2 2 3 3 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1
##  [926] 3 3 3 1 2 3 1 2 1 1 3 1 1 2 3 1 1 1 2 1 1 1 1 3 3 3 3 1 3 1 1 3 2 1 1 3 3
##  [963] 1 1 1 3 3 1 1 1 1 1 1 1 1 3 1 3 1 2 1 1 1 1 1 1 1 1 3 1 1 1 2 1 2 1 3 3 3
## [1000] 1 1 1 1 3 3 2 1 3 1 1 2 2 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 3 3 2 2
## [1037] 2 2 2 2 2 3 2 2 2 2 2 1 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1074] 2 2 2 2 1 1 2 2 2 1 1 2 1 2 2 2 2 1 2 3 1 1 2 2 2 2 2 2 2 2 3 3 2 2 1 2 2
## [1111] 3 2 2 2 2 2 3 1 2 2 2 1 2 2 3 3 2 1 1 1 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1148] 2 2 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 3 2 2 2 2 2 2
## [1185] 1 1 1 1 3 2 2 1 1 1 1 1 1 1 1 2 1 1 1 3 3 3 1 1 1 1 1 1 1 1 3 3 1 3 1 3 2
## [1222] 2 1 1 1 3 3 1 3 3 3 3 1 3 2 3 1 2 3 2 2 2 2 2 2 2 2 2 2 2 2 1 1 3 2 2 3 1
## [1259] 2 1 3 3 1 1 2 1 1 1 1 1 2 1 3 1 3 1 1 1 1 1 1 1 2 1 1 2 2 1 2 2 1 2 2 2 1
## [1296] 2 2 2 2 3 1 2 2 1 1 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 3 2 2 1 1 2 1 2 2 1 2 1
## [1333] 1 1 3 1 1 3 1 3 1 2 1 3 1 3 1 1 1 1 1 3 3 1 1 1 1 3 3 1 1 2 2 1 1 3 1 3 3
## [1370] 1 1 3 1 1 1 1 3 1 3 3 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2
## [1407] 1 2 2 2 2 3 2 1 2 3 2 1 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 2
## [1444] 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 2 3 2 1 2 3 2
## [1481] 3 2 1 1 1 1 1 2 1 1 2 3 3 1 1 2 1 1 3 3 3 2 2 1 1 1 1 2 1 1 3 3 1 1 3 1 3
## [1518] 3 1 3 1 1 2 1 3 3 1 3 1 1 2 1 3 1 1 1 1 3 1 1 3 3 1 1 3 1 3 1 1 3 1 1 1 1
## [1555] 1 1 1 1 3 1 1 3 1 2 2 1 1 2 1 3 1 3 1 3 1 1 1 3 1 3 1 3 1 2 2 2 1 2 2 2 1
## [1592] 1 2 2 2 2 2 2 2 2 2 2 2 2 2 3 1 3 2 2 1 1 2 1 1 2 2 2 2 2 2 1 2 1 2 2 2 2
## [1629] 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1666] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 1 1 3 1 1 2 1 1 1 1 3 2 1 3 1 1 1
## [1703] 1 2 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 3 1 1 1 1 3 3 1 1 1 1 2 2 1 2 2 2 2 1 1
## [1740] 2 1 1 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2 1 2 1 3 1 2 1 2 2 2 3 3 1 3 2 3 2
## [1777] 2 1 1 3 3 1 3 3 3 3 3 1 1 3 1 1 3 3 3 3 1 1 3 1 1 3 1 3 3 1 3 3 3 1 3 3 3
## [1814] 3 1 3 1 3 3 1 1 3 3 1 3 3 1 1 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1851] 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2 2 2 1 1 1 1 1 1 1 1 3 1 1 3 1 1 3 1
## [1888] 2 1 3 1 1 3 3 3 3 1 3 1 3 3 1 3 3 1 1 3 1 3 1 1 1 1 3 1 1 1 3 3 3 1 3 1 2
## [1925] 1 2 1 1 3 3 1 1 1 2 1 2 1 1 2 2 2 2 3 2 3 3 3 1 1 1 2 2 3 1 2 2 1 2 2 3 2
## [1962] 1 2 1 1 3 1 2 1 2 3 2 3 1 1 2 1 2 3 1 1 2 1 2 1 1 1 1 2 3 2 1 1 2 1 2 1 1
## [1999] 1 2 1 2 1 2 1 3 2 1 1 2 1 2 2 1 3 3 2 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 2
## [2036] 1 2 1 1 3 3 3 1 3 3 3 1 3 3 1 3 1 1 3 3 3 3 2 1 1 1 3 3 1 3 3 3 3 3 3 3 3
## [2073] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 3 1 1 1 1 1 1 3 1 1 3 1 1 3 1 3 1 2 1 1 1
## [2110] 1 1 1 2 3 1 1 2 2 1 1 2 2 1 1 1 2 2 2 2 1 2 2 2 2 1 1 2 1 1 1 1 3 1 1 1 3
## [2147] 3 1 1 1 1 3 1 3 3 1 1 2 1 2 2 3 1 1 1 2 2 2 2 2 2 3 2 2 2 2 2 3 1 2 2 2 2
## [2184] 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2
## [2221] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2
## [2258] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 2 2 2 1 1 2 2 3 2 2 2 2 2 2 2 2 2
## [2295] 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3
## [2332] 2 2 2 2 3 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2
## [2369] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [2406] 2 3 2 2 2 2 2 2 1 2 2 2 1 2 2 1 1 2 2 2 2 2 1 1 1 2 2 2 2 2 1 1 1 1 1 1 2
## [2443] 1 3 1 3 1 2 3 3 1 3 3 3 2 1 2 1
## 
## Within cluster sum of squares by cluster:
## [1]  9313.457 10191.177  9045.927
##  (between_SS / total_SS =  62.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
sum(df_tf$membershipCount==0)
## [1] 456
# PCA scores coloured by k-means cluster, for the first three PC pairs.
plot(pca$x[,1:2] , col=clust$cluster)

plot(pca$x[,c(1,3)] , col=clust$cluster)

plot(pca$x[,2:3] , col=clust$cluster)

# Cross-tabulate cluster membership against weekday and affiliation to see
# whether the k-means clusters align with schedule or agency.
table(df_tf[clust$cluster==1,"weekday"])
## 
##    Monday   Tuesday Wednesday  Thursday    Friday  Saturday    Sunday 
##       112       127       130       125       147       137       126
table(df_tf[clust$cluster==2,"weekday"])
## 
##    Monday   Tuesday Wednesday  Thursday    Friday  Saturday    Sunday 
##       111       119       143       153       152       226       194
table(df_tf[clust$cluster==3,"weekday"])
## 
##    Monday   Tuesday Wednesday  Thursday    Friday  Saturday    Sunday 
##        58        69        70        68        65        58        68
(table(df_tf[clust$cluster==1,"affiliation"]))
## 
## another-project       chaoslive     independent     meta-mythos      providence 
##              39              10             231              27              56 
##             psp              vr          虚研社 
##              92             400              49
(table(df_tf[clust$cluster==2,"affiliation"]))
## 
## another-project       chaoslive       chucolala     independent     meta-mythos 
##              22              31              50             492              16 
##      providence             psp              vr          虚研社 
##              18             117             327              25
(table(df_tf[clust$cluster==3,"affiliation"]))
## 
## another-project       chaoslive     independent     meta-mythos      providence 
##              39               9             117               7              26 
##             psp              vr          虚研社 
##              37             196              25

Random Forest

# Prepare a modelling copy of the data for the random forest.
df_rf <- df_tf
# Character identifiers become factors so randomForest can use them.
df_rf$liver <- factor(df_rf$liver)
df_rf$area <- factor(df_rf$area)
df_rf$affiliation <- factor(df_rf$affiliation)
# Drop free text, raw/parsed timestamps and the room id in one pass.
df_rf[c("title", "startDate", "stopDate", "superchatTimestamps",
        "membershipTimestamps", "posix_startDate", "posix_stopDate",
        "id")] <- NULL
set.seed(123456)
N <- nrow(df_rf)
# 80/20 train/test split on row indices.
ind_train <- sample(1:N, ceiling(N * 0.8))
ind_test <- setdiff(1:N, ind_train)
train <- df_rf[ind_train, c(3:8, 10, 12)]
test <- df_rf[ind_test, c(3:8, 10, 12)]
summary(train)
summary(train)
##  danmakusCount     timeDuration       totalIncome         watchCount    
##  Min.   :     0   Min.   : 0.00233   Min.   :     0.0   Min.   :     0  
##  1st Qu.:  2188   1st Qu.: 2.13029   1st Qu.:   543.5   1st Qu.:  5012  
##  Median :  5125   Median : 2.79710   Median :  1556.0   Median : 10596  
##  Mean   : 11815   Mean   : 3.16621   Mean   :  5531.4   Mean   : 21457  
##  3rd Qu.: 11694   3rd Qu.: 3.94296   3rd Qu.:  4384.3   3rd Qu.: 27057  
##  Max.   :192441   Max.   :19.47241   Max.   :450768.7   Max.   :204768  
##  interactionCount  superchatCount    membershipCount     followers      
##  Min.   :    0.0   Min.   :   0.00   Min.   :   0.00   Min.   :  34583  
##  1st Qu.:  495.5   1st Qu.:   2.00   1st Qu.:   1.00   1st Qu.: 115577  
##  Median : 1082.0   Median :   9.00   Median :   3.00   Median : 263239  
##  Mean   : 2117.1   Mean   :  24.53   Mean   :  13.32   Mean   : 353845  
##  3rd Qu.: 2602.0   3rd Qu.:  27.00   3rd Qu.:   8.00   3rd Qu.: 486642  
##  Max.   :24202.0   Max.   :1102.00   Max.   :2173.00   Max.   :1339125
# Floor every feature at 0.1 so the log() transforms below stay finite for
# zero counts.
# Fix: vectorized column-wise pmax() replaces the element-by-element double
# loops over both data frames (identical result, far fewer scalar writes).
train[] <- lapply(train, pmax, 0.1)
test[] <- lapply(test, pmax, 0.1)
library(randomForest)  # already attached in the setup chunk; harmless repeat

# Random forest regression of log totalIncome on the log-scaled features
# (100 trees; importance kept for the variable-importance plot below).
set.seed(123456)
rf <- randomForest(totalIncome ~ ., data=log(train),ntree=100,
                         importance=TRUE, na.action=na.omit)
print(rf)
## 
## Call:
##  randomForest(formula = totalIncome ~ ., data = log(train), ntree = 100,      importance = TRUE, na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 0.5876366
##                     % Var explained: 84.16
## Show "importance" of variables: higher value mean more important:
round(importance(rf), 2)
##                  %IncMSE IncNodePurity
## danmakusCount      12.94       1226.67
## timeDuration        6.03        374.10
## watchCount         10.86        774.90
## interactionCount   11.39        851.80
## superchatCount     21.05       1716.66
## membershipCount    33.39       1956.38
## followers          15.63        238.00
# Linear-regression baseline on the same log-scaled training features.
lr<-lm(totalIncome ~ ., data=log(train))

summary(lr)
## 
## Call:
## lm(formula = totalIncome ~ ., data = log(train))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9761 -0.4902 -0.1065  0.4340  4.3495 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.06468    0.40714  12.440  < 2e-16 ***
## danmakusCount     0.15097    0.06343   2.380   0.0174 *  
## timeDuration      0.20044    0.04165   4.813 1.60e-06 ***
## watchCount        0.05507    0.04231   1.302   0.1932    
## interactionCount  0.16599    0.08943   1.856   0.0636 .  
## superchatCount    0.31034    0.01707  18.182  < 2e-16 ***
## membershipCount   0.46378    0.01443  32.136  < 2e-16 ***
## followers        -0.16234    0.03541  -4.584 4.84e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9092 on 1959 degrees of freedom
## Multiple R-squared:  0.7781, Adjusted R-squared:  0.7773 
## F-statistic: 981.2 on 7 and 1959 DF,  p-value: < 2.2e-16
plot(lr)  # standard lm diagnostic plots

# Back-transform lm predictions to the CNY scale and score on the hold-out set.
lr_fit=exp(predict(lr,log(test)))
lr_resid=(test$totalIncome)-(lr_fit)
summary(lr_resid)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -48501.02   -723.15    -17.41    762.87    430.99  93792.73
# Root mean squared error (CNY).
sqrt(mean(lr_resid^2))
## [1] 7311.825
# Mean absolute error (CNY).
mean(abs(lr_resid))
## [1] 2441.767
# MAE normalised by the largest test income. NOTE(review): max(x, 1) is the
# scalar maximum over the whole vector; if a per-stream relative error was
# intended, this should be pmax(test$totalIncome, 1) -- confirm intent.
mean(abs(lr_resid)/max(test$totalIncome,1))
## [1] 0.008158422
# Same evaluation for the random forest.
rf_fit=exp(predict(rf,log(test)))
rf_resid=test$totalIncome - rf_fit
summary( rf_resid )
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -26669.39   -436.09    -84.67   1600.86    403.42 165570.88
sqrt(mean( rf_resid^2) )
## [1] 10634.32
mean( abs( rf_resid ) )
## [1] 2460.008
# Inspect the worst-predicted stream.
test[which.max(abs(rf_resid)),]
##      danmakusCount timeDuration totalIncome watchCount interactionCount
## 2227        210878     4.298397      299294     112758            25175
##      superchatCount membershipCount followers
## 2227           1724            1006   1339125
mean(abs(rf_resid)/max(test$totalIncome,1))
## [1] 0.008219368
varImpPlot(rf)

plot(rf)  # OOB error versus number of trees

Regression adjustment

proc0 <- function(dt) {
  # Floor every entry of the data frame at 0.5 so a subsequent log() is
  # finite. (This re-defines the identical helper from the clustering
  # section; consider defining it once near the top of the document.)
  # Fix: vectorized column-wise pmax() replaces the element-by-element
  # double loop, with identical results.
  dt[] <- lapply(dt, pmax, 0.5)
  return(dt)
}
# Covariates for the causal analysis: the same 8 columns as the RF section.
df_causal<-df_rf[,c(3:8,10,12)]

# Floor at 0.5, log-transform, then centre every column (subtract col means).
df_causal<-proc0(df_causal)
df_causal<-log(df_causal)
df_causal<-as.data.frame(t(t(df_causal)-colMeans(df_causal)) )
# Treatment indicators: W1 = liver is independent (no agency),
# W2 = stream started on a weekend.
df_causal$W1=0
df_causal[df_rf$affiliation=="independent","W1"]=1
df_causal$W2=0
df_causal[(df_rf$weekday=="Sunday")|(df_rf$weekday=="Saturday"),"W2"]=1
# Keep streams with income >= 1 CNY, then re-centre the numeric columns
# (the first centring is no longer exact after dropping rows).
df_causal<-df_causal[df_rf$totalIncome>=1,]
df_causal[,1:8]<-as.data.frame(t(t(df_causal[,1:8])-colMeans(df_causal[,1:8])) )
# Outcome model using covariates only, excluding both treatments.
lr<-lm(totalIncome~.-W1-W2,df_causal)
summary(lr)
## 
## Call:
## lm(formula = totalIncome ~ . - W1 - W2, data = df_causal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5720 -0.3943 -0.0898  0.3380  3.7391 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -5.088e-15  1.640e-02   0.000   1.0000    
## danmakusCount     1.082e-01  5.177e-02   2.090   0.0368 *  
## timeDuration      8.846e-02  3.754e-02   2.356   0.0185 *  
## watchCount        2.797e-02  3.970e-02   0.705   0.4811    
## interactionCount -2.532e-02  7.501e-02  -0.338   0.7357    
## superchatCount    4.250e-01  1.892e-02  22.466   <2e-16 ***
## membershipCount   6.230e-01  1.579e-02  39.469   <2e-16 ***
## followers        -2.338e-02  3.093e-02  -0.756   0.4498    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.81 on 2432 degrees of freedom
## Multiple R-squared:  0.7793, Adjusted R-squared:  0.7787 
## F-statistic:  1227 on 7 and 2432 DF,  p-value: < 2.2e-16
plot(lr)

# Unadjusted comparison: regress income on the treatment indicator alone
# (equivalent to a difference in group means between W1 = 1 and W1 = 0).
lr0<-lm(totalIncome~W1,df_causal)
summary(lr0)
## 
## Call:
## lm(formula = totalIncome ~ W1, data = df_causal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7086 -0.9311  0.0667  1.0746  5.8536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.22415    0.04228  -5.302 1.25e-07 ***
## W1           0.65499    0.07227   9.063  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.694 on 2438 degrees of freedom
## Multiple R-squared:  0.03259,    Adjusted R-squared:  0.03219 
## F-statistic: 82.13 on 1 and 2438 DF,  p-value: < 2.2e-16
with(df_causal, t.test( (totalIncome[W1==1]), (totalIncome[W1==0]) ) )
## 
##  Welch Two Sample t-test
## 
## data:  (totalIncome[W1 == 1]) and (totalIncome[W1 == 0])
## t = 9.356, df = 1845.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.5176906 0.7922950
## sample estimates:
##  mean of x  mean of y 
##  0.4308457 -0.2241471
# Covariate-adjusted estimate of the W1 (independence) effect, no interactions.
lr1<-lm(totalIncome~(.-W2),df_causal)
summary(lr1)
## 
## Call:
## lm(formula = totalIncome ~ (. - W2), data = df_causal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5519 -0.3914 -0.0925  0.3515  3.8068 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.0503582  0.0207045  -2.432  0.01508 *  
## danmakusCount     0.1321863  0.0519717   2.543  0.01104 *  
## timeDuration      0.1199312  0.0382602   3.135  0.00174 ** 
## watchCount        0.0126697  0.0397688   0.319  0.75007    
## interactionCount -0.0666659  0.0755034  -0.883  0.37735    
## superchatCount    0.4227338  0.0188677  22.405  < 2e-16 ***
## membershipCount   0.6240051  0.0157398  39.645  < 2e-16 ***
## followers        -0.0003863  0.0313809  -0.012  0.99018    
## W1                0.1471545  0.0371261   3.964 7.59e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8075 on 2431 degrees of freedom
## Multiple R-squared:  0.7807, Adjusted R-squared:   0.78 
## F-statistic:  1082 on 8 and 2431 DF,  p-value: < 2.2e-16
plot(lr1)

# Add W1 x covariate interactions to probe effect heterogeneity.
lr2<-lm(totalIncome~(.-W2)+W1*(.-W2),df_causal)
summary(lr2)
## 
## Call:
## lm(formula = totalIncome ~ (. - W2) + W1 * (. - W2), data = df_causal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5868 -0.3897 -0.0630  0.3532  3.7454 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -0.0296278  0.0206699  -1.433  0.15188    
## danmakusCount        0.1022086  0.0614442   1.663  0.09635 .  
## timeDuration         0.0111641  0.0465597   0.240  0.81052    
## watchCount          -0.0009463  0.0602050  -0.016  0.98746    
## interactionCount     0.0052127  0.0975093   0.053  0.95737    
## superchatCount       0.4539843  0.0224526  20.220  < 2e-16 ***
## membershipCount      0.6297061  0.0191011  32.967  < 2e-16 ***
## followers            0.0285459  0.0418848   0.682  0.49560    
## W1                   0.2258040  0.0378903   5.959 2.90e-09 ***
## danmakusCount:W1     0.0780596  0.1127683   0.692  0.48887    
## timeDuration:W1      0.3491868  0.0820659   4.255 2.17e-05 ***
## watchCount:W1        0.0247559  0.0797540   0.310  0.75628    
## interactionCount:W1 -0.2190745  0.1603594  -1.366  0.17202    
## superchatCount:W1   -0.1059212  0.0405412  -2.613  0.00904 ** 
## membershipCount:W1   0.0018722  0.0332638   0.056  0.95512    
## followers:W1        -0.1248989  0.0648343  -1.926  0.05417 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.794 on 2424 degrees of freedom
## Multiple R-squared:  0.7886, Adjusted R-squared:  0.7873 
## F-statistic: 602.9 on 15 and 2424 DF,  p-value: < 2.2e-16
plot(lr2)

# Same sequence for treatment W2 (weekend stream): unadjusted regression first.
lr0<-lm(totalIncome~W2,df_causal)
summary(lr0)
## 
## Call:
## lm(formula = totalIncome ~ W2, data = df_causal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5281 -0.9537  0.0730  1.1194  5.7529 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.12350    0.04238  -2.914   0.0036 ** 
## W2           0.37388    0.07374   5.070 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.713 on 2438 degrees of freedom
## Multiple R-squared:  0.01044,    Adjusted R-squared:  0.01003 
## F-statistic: 25.71 on 1 and 2438 DF,  p-value: 4.268e-07
with(df_causal, t.test( (totalIncome[W2==1]), (totalIncome[W2==0]) ) )
## 
##  Welch Two Sample t-test
## 
## data:  (totalIncome[W2 == 1]) and (totalIncome[W2 == 0])
## t = 5.0329, df = 1571.9, p-value = 5.386e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2281671 0.5195881
## sample estimates:
##  mean of x  mean of y 
##  0.2503754 -0.1235022
# Covariate-adjusted estimate of the W2 (weekend) effect, no interactions.
lr1<-lm(totalIncome~(.-W1),df_causal)
summary(lr1)
## 
## Call:
## lm(formula = totalIncome ~ (. - W1), data = df_causal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5597 -0.3960 -0.0902  0.3331  3.7569 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.01731    0.02011  -0.861   0.3894    
## danmakusCount     0.11002    0.05178   2.125   0.0337 *  
## timeDuration      0.08530    0.03759   2.269   0.0233 *  
## watchCount        0.02786    0.03969   0.702   0.4828    
## interactionCount -0.02264    0.07501  -0.302   0.7628    
## superchatCount    0.42241    0.01899  22.244   <2e-16 ***
## membershipCount   0.62154    0.01581  39.303   <2e-16 ***
## followers        -0.02392    0.03093  -0.774   0.4393    
## W2                0.05240    0.03524   1.487   0.1372    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8098 on 2431 degrees of freedom
## Multiple R-squared:  0.7795, Adjusted R-squared:  0.7788 
## F-statistic:  1074 on 8 and 2431 DF,  p-value: < 2.2e-16
# Add W2 x covariate interactions to probe effect heterogeneity.
lr2<-lm(totalIncome~(.-W1)+W2*(.-W1),df_causal)
summary(lr2)
## 
## Call:
## lm(formula = totalIncome ~ (. - W1) + W2 * (. - W1), data = df_causal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5367 -0.3932 -0.0862  0.3410  3.7479 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -0.01926    0.02015  -0.956   0.3392    
## danmakusCount        0.06108    0.06374   0.958   0.3380    
## timeDuration         0.10828    0.04736   2.286   0.0223 *  
## watchCount           0.03763    0.04950   0.760   0.4472    
## interactionCount     0.05618    0.09170   0.613   0.5402    
## superchatCount       0.40472    0.02373  17.055   <2e-16 ***
## membershipCount      0.61288    0.01972  31.081   <2e-16 ***
## followers           -0.04956    0.03766  -1.316   0.1883    
## W2                   0.04798    0.03538   1.356   0.1751    
## danmakusCount:W2     0.15967    0.10990   1.453   0.1464    
## timeDuration:W2     -0.06040    0.07794  -0.775   0.4384    
## watchCount:W2       -0.02118    0.08303  -0.255   0.7986    
## interactionCount:W2 -0.24872    0.15991  -1.555   0.1200    
## superchatCount:W2    0.04424    0.03979   1.112   0.2663    
## membershipCount:W2   0.02200    0.03315   0.663   0.5071    
## followers:W2         0.07782    0.06619   1.176   0.2398    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8097 on 2424 degrees of freedom
## Multiple R-squared:  0.7802, Adjusted R-squared:  0.7789 
## F-statistic: 573.7 on 15 and 2424 DF,  p-value: < 2.2e-16
cor(lr2$residuals,df_causal$W2)
## [1] -2.776904e-16
# Income not attributable to memberships or superchats (gifts etc.).
df_rf$otherIncome<-with(df_rf,totalIncome-membershipIncome-superchatIncome)


# NOTE(review): max(df_rf$totalIncome, 1) is a single scalar — the global
# maximum income across all streams (floored at 1) — so every row is divided
# by the same constant.  If a per-stream share was intended, this would need
# pmax(df_rf$totalIncome, 1) instead; the printed summary below is consistent
# with the scalar version.  TODO confirm intent.
hist(df_rf$otherIncome/max(df_rf$totalIncome,1))

summary(df_rf$otherIncome/max(df_rf$totalIncome,1))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000000 0.0002351 0.0007771 0.0025227 0.0023349 0.1172686

Causal forest (grf)

library(grf)
set.seed(114514)
# Covariates only: drop totalIncome (col 3) and the treatments W1, W2 (cols 9, 10).
X=df_causal[,-c(3,9,10)]
Y=df_causal[,"totalIncome"]
W=df_causal[,"W1"]

# Orthogonalization: estimate the propensity W.hat = E[W|X] ...
forest.W <- regression_forest(X, W, tune.parameters = "all")
W.hat <- predict(forest.W)$predictions

# ... and the outcome surface Y.hat = E[Y|X] with separate regression forests.
forest.Y <- regression_forest(X, Y, tune.parameters = "all")
Y.hat <- predict(forest.Y)$predictions

# Causal forest for the W1 (independence) effect using the supplied nuisance
# estimates; tau.hat are out-of-bag CATE predictions.
tau.forest <- causal_forest(X, Y, W,
  W.hat = W.hat, Y.hat = Y.hat,
  tune.parameters = "all"
)
tau.hat <- predict(tau.forest)$predictions

summary(tau.hat)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.38225 -0.03152  0.19582  0.32277  0.75373  2.53658
# Overlap-weighted ATE under two estimators; identical results here.
average_treatment_effect(tau.forest, target.sample = "overlap",method="TMLE")
##  estimate   std.err 
## 0.2315368 0.1067300
average_treatment_effect(tau.forest, target.sample = "overlap",method="AIPW")
##  estimate   std.err 
## 0.2315368 0.1067300
# Propensity diagnostics: W.hat against the outcome, colored by treatment.
ggplot(NULL, aes(x=df_causal$totalIncome, y=W.hat)) + 
  geom_point(aes(color=df_causal$W1))

# W.hat ranges ~0.01-0.996 — propensities get close to 0/1, i.e. weak overlap.
summary(W.hat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01375 0.04009 0.07809 0.33909 0.80547 0.99647
# Density of estimated propensities by treatment arm.
ggplot(NULL, aes(W.hat, fill=factor(df_causal$W1) )) + 
  geom_density(alpha=.5) + 
  scale_fill_manual(values = c('#999999','#E69F00'))

# Propensity residuals, centered near zero.
summary(W.hat-W)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.881539 -0.045613  0.032345 -0.003121  0.055067  0.807624
# Print the outcome forest (shows its variable importances).
forest.Y
## GRF forest object of type regression_forest 
## Number of trees: 2000 
## Number of training samples: 2440 
## Variable importance: 
##     1     2     3     4     5     6     7 
## 0.039 0.003 0.006 0.015 0.333 0.600 0.005
# Re-fit the W1 causal forest letting grf estimate the nuisances internally
# (no explicit W.hat/Y.hat), as a robustness check against the fit above.
set.seed(114514)
X=df_causal[,-c(3,9,10)]
Y=df_causal[,"totalIncome"]
W=df_causal[,"W1"]

tau.forest <- causal_forest(X, Y, W,
  tune.parameters = "all"
)
tau.hat <- predict(tau.forest)$predictions

# Per-observation CATE predictions with variance estimates.
head(predict(tau.forest,estimate.variance = TRUE))
##   predictions variance.estimates debiased.error excess.error
## 1    1.737771          0.7248275     0.01183947 0.0002977976
## 2    1.937831          0.3315386     0.18403453 0.0001180892
## 3    2.086950          0.4820520     0.93942636 0.0008962427
## 4    2.044104          0.6409845     0.07063379 0.0001085127
## 5    2.204244          0.2505762     0.49207595 0.0002021150
## 6    1.751580          0.3623757     0.29835226 0.0011565514
summary(tau.hat)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.28502 -0.02547  0.21464  0.33527  0.75112  2.41368
# Overlap-weighted ATE — close to the orthogonalized fit above.
average_treatment_effect(tau.forest, target.sample = "overlap")
##  estimate   std.err 
## 0.2366983 0.1052452
# Rank covariates by split-based variable importance in the causal forest.
varimp <- variable_importance(tau.forest)
ranked.vars <- order(varimp, decreasing = TRUE)

# All 7 variables, most important first
colnames(X)[ranked.vars[1:7]]
## [1] "followers"        "timeDuration"     "interactionCount" "danmakusCount"   
## [5] "watchCount"       "superchatCount"   "membershipCount"
#> NOTE: the three lines below are leftover example output copied from the grf
#> tutorial (a different dataset); they do not belong to this analysis.
#> [1] "financial.autonomy.index"           "intention.to.save.index"           
#> [3] "family.receives.cash.transfer"      "has.computer.with.internet.at.home"
#> [5] "is.female"

best_linear_projection(tau.forest, X)
## Warning in best_linear_projection(tau.forest, X): Estimated treatment
## propensities take values between 0.011 and 0.998 and in particular get very
## close to 0 or 1.
## 
## Best linear projection of the conditional average treatment effect.
## Confidence intervals are cluster- and heteroskedasticity-robust (HC3):
## 
##                   Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)       0.361159   0.020051 18.0120 < 2.2e-16 ***
## danmakusCount     0.002989   0.063475  0.0471    0.9624    
## timeDuration      0.420571   0.050931  8.2577 2.407e-16 ***
## watchCount       -0.047197   0.061924 -0.7622    0.4460    
## interactionCount  0.014608   0.102956  0.1419    0.8872    
## superchatCount   -0.218872   0.023447 -9.3348 < 2.2e-16 ***
## membershipCount   0.041215   0.019364  2.1285    0.0334 *  
## followers        -0.281476   0.045170 -6.2315 5.429e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
best_linear_projection(tau.forest, X)
## Warning in best_linear_projection(tau.forest, X): Estimated treatment
## propensities take values between 0.011 and 0.998 and in particular get very
## close to 0 or 1.
## 
## Best linear projection of the conditional average treatment effect.
## Confidence intervals are cluster- and heteroskedasticity-robust (HC3):
## 
##                   Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)       0.361159   0.020051 18.0120 < 2.2e-16 ***
## danmakusCount     0.002989   0.063475  0.0471    0.9624    
## timeDuration      0.420571   0.050931  8.2577 2.407e-16 ***
## watchCount       -0.047197   0.061924 -0.7622    0.4460    
## interactionCount  0.014608   0.102956  0.1419    0.8872    
## superchatCount   -0.218872   0.023447 -9.3348 < 2.2e-16 ***
## membershipCount   0.041215   0.019364  2.1285    0.0334 *  
## followers        -0.281476   0.045170 -6.2315 5.429e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# See if a causal forest succeeded in capturing heterogeneity by plotting
# the TOC and calculating a 95% CI for the AUTOC (split-sample RATE: rank on
# a forest trained on one half, evaluate on an independent forest on the other).
set.seed(114514)
X=df_causal[,-c(3,9,10)]
Y=df_causal[,"totalIncome"]
W=df_causal[,"W1"]

n=nrow(df_causal)
train <- sample(1:n, n / 2)

# Nuisance estimates and causal forest on the training half.
forest.W <- regression_forest(X[train,], W[train], tune.parameters = "all")
W.hat <- predict(forest.W)$predictions

forest.Y <- regression_forest(X[train,], Y[train], tune.parameters = "all")
Y.hat <- predict(forest.Y)$predictions

train.forest <- causal_forest(X[train, ], Y[train], W[train], W.hat = W.hat, Y.hat = Y.hat,
  tune.parameters = "all")

# Independent nuisance estimates and causal forest on the held-out half.
forest.W <- regression_forest(X[-train,], W[-train], tune.parameters = "all")
W.hat <- predict(forest.W)$predictions

forest.Y <- regression_forest(X[-train,], Y[-train], tune.parameters = "all")
Y.hat <- predict(forest.Y)$predictions

eval.forest <- causal_forest(X[-train, ], Y[-train], W[-train], W.hat = W.hat, Y.hat = Y.hat,
  tune.parameters = "all")
# RATE of the eval forest, prioritizing by the train forest's CATE predictions.
rate <- rank_average_treatment_effect(eval.forest,
  predict(train.forest, X[-train, ])$predictions)
plot(rate)

# 95% CI for the AUTOC.  (Fixed label: "+/" -> "+/-".)
paste("AUTOC:", round(rate$estimate, 2), "+/-", round(1.96 * rate$std.err, 2))
## [1] "AUTOC: 0.33 +/- 0.06"
# Repeat the orthogonalized causal-forest analysis for treatment W2 (weekend).
set.seed(114514)
X=df_causal[,-c(3,9,10)]
Y=df_causal[,"totalIncome"]
W=df_causal[,"W2"]

# Propensity model E[W2|X].
forest.W <- regression_forest(X, W, tune.parameters = "all")
W.hat <- predict(forest.W)$predictions

# Outcome model E[Y|X].
forest.Y <- regression_forest(X, Y, tune.parameters = "all")
Y.hat <- predict(forest.Y)$predictions

tau.forest <- causal_forest(X, Y, W,
  W.hat = W.hat, Y.hat = Y.hat,
  tune.parameters = "all"
)
tau.hat <- predict(tau.forest)$predictions

# CATE predictions with variance estimates.
head(predict(tau.forest,estimate.variance = TRUE))
##    predictions variance.estimates debiased.error excess.error
## 1 -0.024784535         0.02483971     0.06838806 0.0002614504
## 2 -0.007747133         0.01043775     0.30354459 0.0002241894
## 3 -0.042947017         0.02041626     0.02119088 0.0002437538
## 4 -0.029243547         0.03527779     0.17953407 0.0002984242
## 5  0.001676725         0.02149095     0.14924601 0.0003682184
## 6  0.031699562         0.01628176     0.01910503 0.0001907937
summary(tau.hat)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.27713  0.01919  0.07443  0.06096  0.11186  0.22084
# Overlap-weighted ATE for W2 — small and borderline significant.
average_treatment_effect(tau.forest, target.sample = "overlap")
##   estimate    std.err 
## 0.06347278 0.03007109
# Propensity diagnostics, as in the W1 section.
ggplot(NULL, aes(x=df_causal$totalIncome, y=W.hat)) + 
  geom_point(aes(color=df_causal$W2))

# W2 propensities stay well inside (0,1) — overlap is much better than for W1.
summary(W.hat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1929  0.2895  0.3202  0.3299  0.3557  0.6390
ggplot(NULL, aes(W.hat, fill=factor(df_causal$W2) )) + 
  geom_density(alpha=.5) + 
  scale_fill_manual(values = c('#999999','#E69F00'))

summary(W.hat-W)
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.8070922 -0.6077922  0.2855493 -0.0004587  0.3309460  0.6390002
# Print the outcome forest (same Y model as the W1 section; same importances).
forest.Y
## GRF forest object of type regression_forest 
## Number of trees: 2000 
## Number of training samples: 2440 
## Variable importance: 
##     1     2     3     4     5     6     7 
## 0.039 0.003 0.006 0.015 0.333 0.600 0.005
# Rank covariates by variable importance in the W2 causal forest.
varimp <- variable_importance(tau.forest)
ranked.vars <- order(varimp, decreasing = TRUE)

# All 7 variables, most important first
colnames(X)[ranked.vars[1:7]]
## [1] "danmakusCount"    "timeDuration"     "watchCount"       "followers"       
## [5] "interactionCount" "superchatCount"   "membershipCount"
#> NOTE: the three lines below are leftover example output copied from the grf
#> tutorial (a different dataset); they do not belong to this analysis.
#> [1] "financial.autonomy.index"           "intention.to.save.index"           
#> [3] "family.receives.cash.transfer"      "has.computer.with.internet.at.home"
#> [5] "is.female"

best_linear_projection(tau.forest, X)
## 
## Best linear projection of the conditional average treatment effect.
## Confidence intervals are cluster- and heteroskedasticity-robust (HC3):
## 
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       0.055731   0.030959  1.8002  0.07196 .
## danmakusCount     0.057955   0.112002  0.5174  0.60489  
## timeDuration     -0.021498   0.090219 -0.2383  0.81167  
## watchCount       -0.110506   0.140170 -0.7884  0.43056  
## interactionCount -0.016864   0.202127 -0.0834  0.93351  
## superchatCount    0.060470   0.037018  1.6335  0.10249  
## membershipCount   0.016470   0.025834  0.6375  0.52385  
## followers         0.052154   0.071730  0.7271  0.46724  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Re-fit the W2 causal forest with internally estimated nuisances
# (robustness check, mirroring the W1 section).
set.seed(114514)
X=df_causal[,-c(3,9,10)]
Y=df_causal[,"totalIncome"]
W=df_causal[,"W2"]

tau.forest <- causal_forest(X, Y, W,
  tune.parameters = "all"
)
tau.hat <- predict(tau.forest)$predictions

head(predict(tau.forest,estimate.variance = TRUE))
##     predictions variance.estimates debiased.error excess.error
## 1 -0.0164990427         0.02844612     0.05402773 0.0005632331
## 2  0.0002077488         0.12623932     0.18840197 0.0004624321
## 3 -0.0488933046         0.06579101     0.14049751 0.0006234160
## 4  0.0088083348         0.23757168     0.04202226 0.0010338187
## 5  0.0014174466         0.23620999     0.28105350 0.0015134367
## 6  0.0359870202         0.07364137     0.01752424 0.0004741497
summary(tau.hat)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.426036  0.006774  0.075757  0.064282  0.127471  0.342546
# Overlap-weighted ATE — matches the orthogonalized W2 fit above.
average_treatment_effect(tau.forest, target.sample = "overlap")
##   estimate    std.err 
## 0.06423604 0.03012605
# See if a causal forest succeeded in capturing heterogeneity by plotting
# the TOC and calculating a 95% CI for the AUTOC — split-sample RATE for W2,
# mirroring the W1 section.
set.seed(114514)
X=df_causal[,-c(3,9,10)]
Y=df_causal[,"totalIncome"]
W=df_causal[,"W2"]

n=nrow(df_causal)
train <- sample(1:n, n / 2)

# Nuisance estimates and causal forest on the training half.
forest.W <- regression_forest(X[train,], W[train], tune.parameters = "all")
W.hat <- predict(forest.W)$predictions

forest.Y <- regression_forest(X[train,], Y[train], tune.parameters = "all")
Y.hat <- predict(forest.Y)$predictions

train.forest <- causal_forest(X[train, ], Y[train], W[train], W.hat = W.hat, Y.hat = Y.hat,
  tune.parameters = "all")

# Independent nuisance estimates and causal forest on the held-out half.
forest.W <- regression_forest(X[-train,], W[-train], tune.parameters = "all")
W.hat <- predict(forest.W)$predictions

forest.Y <- regression_forest(X[-train,], Y[-train], tune.parameters = "all")
Y.hat <- predict(forest.Y)$predictions

eval.forest <- causal_forest(X[-train, ], Y[-train], W[-train], W.hat = W.hat, Y.hat = Y.hat,
  tune.parameters = "all")
# RATE of the eval forest, prioritizing by the train forest's CATE predictions.
rate <- rank_average_treatment_effect(eval.forest,
  predict(train.forest, X[-train, ])$predictions)
plot(rate)

# 95% CI for the AUTOC; consistent with no detectable heterogeneity for W2.
# (Fixed label: "+/" -> "+/-".)
paste("AUTOC:", round(rate$estimate, 2), "+/-", round(1.96 * rate$std.err, 2))
## [1] "AUTOC: 0.01 +/- 0.09"

Matching with MatchIt

library(MatchIt)
# Full matching on a propensity score for W1 (independence), targeting the ATE.
m.out <- matchit(W1 ~ danmakusCount+timeDuration+watchCount+interactionCount+superchatCount+membershipCount+followers, data = df_causal,
                 method = "full", estimand = "ATE")
# Balance summary before/after matching.
summary(m.out)
## 
## Call:
## matchit(formula = W1 ~ danmakusCount + timeDuration + watchCount + 
##     interactionCount + superchatCount + membershipCount + followers, 
##     data = df_causal, method = "full", estimand = "ATE")
## 
## Summary of Balance for All Data:
##                  Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                0.4422        0.2902          0.8810     1.4141
## danmakusCount           0.2969       -0.1545          0.3814     1.0152
## timeDuration           -0.0784        0.0408         -0.2258     0.9465
## watchCount              0.3887       -0.2022          0.5180     0.8011
## interactionCount        0.3669       -0.1909          0.5308     0.7993
## superchatCount          0.4188       -0.2179          0.3990     1.0045
## membershipCount         0.2352       -0.1223          0.2602     0.9848
## followers               0.1631       -0.0849          0.2754     0.8238
##                  eCDF Mean eCDF Max
## distance            0.2293   0.3282
## danmakusCount       0.1170   0.1898
## timeDuration        0.0678   0.1306
## watchCount          0.1551   0.2387
## interactionCount    0.1578   0.2653
## superchatCount      0.0653   0.2010
## membershipCount     0.0196   0.1243
## followers           0.1293   0.3224
## 
## Summary of Balance for Matched Data:
##                  Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                0.3422        0.3414          0.0048     1.0281
## danmakusCount           0.0449       -0.0072          0.0441     0.9499
## timeDuration            0.0078       -0.0290          0.0697     1.0163
## watchCount              0.0874       -0.0076          0.0832     0.7663
## interactionCount        0.0545        0.0009          0.0510     0.7938
## superchatCount          0.0049       -0.0243          0.0183     1.0547
## membershipCount         0.0248       -0.0131          0.0276     0.8481
## followers               0.1042        0.0415          0.0696     0.9074
##                  eCDF Mean eCDF Max Std. Pair Dist.
## distance            0.0012   0.0131          0.0116
## danmakusCount       0.0307   0.0613          1.0118
## timeDuration        0.0278   0.0682          1.1303
## watchCount          0.0539   0.1164          0.9251
## interactionCount    0.0500   0.1007          0.9408
## superchatCount      0.0133   0.0730          1.0292
## membershipCount     0.0143   0.0363          1.0896
## followers           0.0784   0.2197          1.2585
## 
## Sample Sizes:
##               Control Treated
## All           1605.    835.  
## Matched (ESS) 1008.56  379.95
## Matched       1605.    835.  
## Unmatched        0.      0.  
## Discarded        0.      0.
# Love plot of standardized mean differences before/after matching.
plot(summary(m.out))

# Matched data with distance, matching weights, and subclass columns appended.
m.data <- match.data(m.out)

head(m.data)
##   danmakusCount timeDuration totalIncome watchCount interactionCount
## 1    -0.7696378   0.09821249   0.1919227 -1.0381997       -0.9292886
## 2    -0.4105296   0.16726665   2.0330875 -0.8488732       -0.6023242
## 3    -0.9621366  -0.16904624   0.9786163 -1.2253391       -0.9662896
## 4    -0.3394832   0.39824559   1.7372021 -0.9497793       -0.7233869
## 5    -1.4040206   1.54300502  -0.7764418 -0.7478239       -1.2056391
## 6    -1.3862329  -0.28382835   0.1947693 -1.4512956       -1.3188552
##   superchatCount membershipCount followers W1 W2  distance   weights subclass
## 1      0.6975887     -1.84848200  -1.93097  1  0 0.4085820 0.6844262        1
## 2      0.4099066      1.24256045  -1.93097  1  0 0.4231426 0.6844262       99
## 3     -0.1290899      1.04188975  -1.93097  1  0 0.4381860 0.6844262      204
## 4      0.8694390      0.63642465  -1.93097  1  1 0.3026124 0.6844262      300
## 5     -0.9763877     -0.05672253  -1.93097  1  1 0.1363506 4.7909836      393
## 6      0.2275851     -0.46218764  -1.93097  1  0 0.4233618 0.6844262      478
library("marginaleffects")
set.seed(114514)
# Causal forest on the matched W1 sample, weighting by the matching weights.
# Covariates = columns 1:2 and 4:8 of m.data (excludes totalIncome, col 3).
X=m.data[,c(1:2,4:8)]
Y=m.data[,"totalIncome"]
W=m.data[,"W1"]


tau.forest <- causal_forest(X, Y, W,
  tune.parameters = "all",sample.weights = m.data$weights
)
tau.hat <- predict(tau.forest)$predictions

head(predict(tau.forest,estimate.variance = TRUE))
##   predictions variance.estimates debiased.error excess.error
## 1    2.732758           2.499211     0.71491608 0.0005750609
## 2    2.460348           3.777838     0.20360405 0.0003453791
## 3    2.908501           1.588693     0.60327193 0.0018188093
## 4    2.965362           1.774319     0.04601626 0.0004729487
## 5    2.942272           1.088220     0.58561488 0.0007510384
## 6    2.968560           1.680967     0.65850546 0.0019462661
summary(tau.hat)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.24521 -0.08183  0.25090  0.44143  0.91377  4.28008
# ATE under several target samples; grf warns about extreme propensities
# (near 0/1), so the overlap-weighted estimate below is the most trustworthy.
average_treatment_effect(tau.forest, target.sample = "all")
## Warning in average_treatment_effect(tau.forest, target.sample = "all"):
## Estimated treatment propensities take values between 0.01 and 0.997
## and in particular get very close to 0 and 1. In this case, using
## `target.sample=overlap`, or filtering data as in Crump, Hotz, Imbens, and Mitnik
## (Biometrika, 2009) may be helpful.
##   estimate    std.err 
## 0.48884385 0.03863781
average_treatment_effect(tau.forest, target.sample = "treated")
## Warning in average_treatment_effect(tau.forest, target.sample =
## "treated"): Estimated treatment propensities take values between 0.01 and
## 0.997 and in particular get very close to 0 and 1. In this case, using
## `target.sample=overlap`, or filtering data as in Crump, Hotz, Imbens, and Mitnik
## (Biometrika, 2009) may be helpful.
##   estimate    std.err 
## 0.44586335 0.09154942
average_treatment_effect(tau.forest, target.sample = "control")
## Warning in average_treatment_effect(tau.forest, target.sample =
## "control"): Estimated treatment propensities take values between 0.01 and
## 0.997 and in particular get very close to 0 and 1. In this case, using
## `target.sample=overlap`, or filtering data as in Crump, Hotz, Imbens, and Mitnik
## (Biometrika, 2009) may be helpful.
##   estimate    std.err 
## 0.54881240 0.07219962
average_treatment_effect(tau.forest, target.sample = "overlap")
##  estimate   std.err 
## 0.3094175 0.1578538
library("marginaleffects")

# G-computation on the matched sample: weighted OLS with full W1 x covariate
# interactions (dropping the MatchIt bookkeeping columns and W2).
fit <- lm(totalIncome~(.-W2-distance-weights-subclass)+W1*(.-W2-distance-weights-subclass),data = m.data, weights = weights)

# Marginal W1 effect with cluster-robust (by matching subclass) SEs,
# averaged with the matching weights.
avg_comparisons(fit,
                variables = "W1",
                vcov = ~subclass,
                wts = "weights")
## 
##  Term Contrast Estimate Std. Error    z Pr(>|z|) 2.5 % 97.5 %
##    W1    1 - 0    0.217     0.0483 4.49   <0.001 0.122  0.311
## 
## Columns: term, contrast, estimate, std.error, statistic, p.value, conf.low, conf.high
library("marginaleffects")

# Same estimate from an additive (no-interaction) outcome model.
fit <- lm(totalIncome~(.-W2-distance-weights-subclass),data = m.data, weights = weights)

avg_comparisons(fit,
                variables = "W1",
                vcov = ~subclass,
                wts = "weights")
## 
##  Term Contrast Estimate Std. Error    z Pr(>|z|) 2.5 % 97.5 %
##    W1    1 - 0    0.213     0.0545 3.91   <0.001 0.106   0.32
## 
## Columns: term, contrast, estimate, std.error, statistic, p.value, conf.low, conf.high
library("marginaleffects")

# Simplest version: weighted difference in means on the matched sample.
fit <- lm(totalIncome~W1,data = m.data, weights = weights)

avg_comparisons(fit,
                variables = "W1",
                vcov = ~subclass,
                wts = "weights")
## 
##  Term Contrast Estimate Std. Error    z Pr(>|z|)  2.5 % 97.5 %
##    W1    1 - 0    0.259     0.0954 2.72  0.00655 0.0724  0.446
## 
## Columns: term, contrast, estimate, std.error, statistic, p.value, conf.low, conf.high
library(MatchIt)
# Full matching on a propensity score for W2 (weekend), targeting the ATE.
m.out2 <- matchit(W2 ~ danmakusCount+timeDuration+watchCount+interactionCount+superchatCount+membershipCount+followers, data = df_causal,
                 method = "full", estimand = "ATE")
summary(m.out2)
## 
## Call:
## matchit(formula = W2 ~ danmakusCount + timeDuration + watchCount + 
##     interactionCount + superchatCount + membershipCount + followers, 
##     data = df_causal, method = "full", estimand = "ATE")
## 
## Summary of Balance for All Data:
##                  Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                0.3456        0.3228          0.3228     1.3382
## danmakusCount           0.0595       -0.0294          0.0742     0.9682
## timeDuration            0.0484       -0.0239          0.1351     1.0882
## watchCount              0.0298       -0.0147          0.0373     1.0158
## interactionCount        0.0323       -0.0159          0.0440     0.9614
## superchatCount          0.2064       -0.1018          0.1903     1.0208
## membershipCount         0.1883       -0.0929          0.2025     1.0784
## followers              -0.0108        0.0053         -0.0173     1.0740
##                  eCDF Mean eCDF Max
## distance            0.0881   0.1512
## danmakusCount       0.0211   0.0444
## timeDuration        0.0427   0.0801
## watchCount          0.0129   0.0374
## interactionCount    0.0099   0.0363
## superchatCount      0.0296   0.0860
## membershipCount     0.0174   0.1022
## followers           0.0147   0.0340
## 
## Summary of Balance for Matched Data:
##                  Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                0.3303        0.3303          0.0006     1.0008
## danmakusCount           0.0369        0.0055          0.0263     0.9822
## timeDuration            0.0254       -0.0005          0.0483     1.0391
## watchCount              0.0561        0.0057          0.0422     1.1170
## interactionCount        0.0485        0.0061          0.0386     0.9634
## superchatCount          0.0382       -0.0038          0.0260     1.0051
## membershipCount        -0.0123        0.0221         -0.0248     0.8547
## followers               0.0553        0.0066          0.0526     1.0594
##                  eCDF Mean eCDF Max Std. Pair Dist.
## distance            0.0008   0.0053          0.0048
## danmakusCount       0.0106   0.0364          1.1241
## timeDuration        0.0223   0.0535          0.9996
## watchCount          0.0165   0.0454          1.1424
## interactionCount    0.0100   0.0325          1.1534
## superchatCount      0.0054   0.0217          0.8874
## membershipCount     0.0131   0.0365          0.8121
## followers           0.0210   0.0509          1.1745
## 
## Sample Sizes:
##               Control Treated
## All           1634.    806.  
## Matched (ESS) 1409.12  564.08
## Matched       1634.    806.  
## Unmatched        0.      0.  
## Discarded        0.      0.
# Love plot for the W2 matching.
plot(summary(m.out2))

# Matched data for W2 with distance, weights, and subclass appended.
m.data2 <- match.data(m.out2)

head(m.data2)
##   danmakusCount timeDuration totalIncome watchCount interactionCount
## 1    -0.7696378   0.09821249   0.1919227 -1.0381997       -0.9292886
## 2    -0.4105296   0.16726665   2.0330875 -0.8488732       -0.6023242
## 3    -0.9621366  -0.16904624   0.9786163 -1.2253391       -0.9662896
## 4    -0.3394832   0.39824559   1.7372021 -0.9497793       -0.7233869
## 5    -1.4040206   1.54300502  -0.7764418 -0.7478239       -1.2056391
## 6    -1.3862329  -0.28382835   0.1947693 -1.4512956       -1.3188552
##   superchatCount membershipCount followers W1 W2  distance   weights subclass
## 1      0.6975887     -1.84848200  -1.93097  1  0 0.3683040 1.3393443      632
## 2      0.4099066      1.24256045  -1.93097  1  0 0.4205506 0.8036066      580
## 3     -0.1290899      1.04188975  -1.93097  1  0 0.4030760 0.8928962       84
## 4      0.8694390      0.63642465  -1.93097  1  1 0.4472629 0.4129098        1
## 5     -0.9763877     -0.05672253  -1.93097  1  1 0.4723792 0.4404372      108
## 6      0.2275851     -0.46218764  -1.93097  1  0 0.4043168 1.3393443      241
library("marginaleffects")

# G-computation for W2 on the matched sample: weighted OLS with full
# W2 x covariate interactions (dropping MatchIt bookkeeping columns and W1).
fit <- lm(totalIncome~(.-W1-distance-weights-subclass)+W2*(.-W1-distance-weights-subclass),data = m.data2, weights = weights)

avg_comparisons(fit,
                variables = "W2",
                vcov = ~subclass,
                wts = "weights")
## 
##  Term Contrast Estimate Std. Error    z Pr(>|z|)   2.5 % 97.5 %
##    W2    1 - 0   0.0213     0.0402 0.53    0.596 -0.0575    0.1
## 
## Columns: term, contrast, estimate, std.error, statistic, p.value, conf.low, conf.high
library("marginaleffects")

# Same estimate from an additive (no-interaction) outcome model.
fit <- lm(totalIncome~(.-W1-distance-weights-subclass),data = m.data2, weights = weights)

avg_comparisons(fit,
                variables = "W2",
                vcov = ~subclass,
                wts = "weights")
## 
##  Term Contrast Estimate Std. Error    z Pr(>|z|)   2.5 % 97.5 %
##    W2    1 - 0   0.0206     0.0404 0.51     0.61 -0.0586 0.0999
## 
## Columns: term, contrast, estimate, std.error, statistic, p.value, conf.low, conf.high
# Unadjusted (difference-in-means style) estimate on the weighted matched
# sample: outcome regressed on the treatment indicator alone.
library(marginaleffects)

fit <- lm(totalIncome ~ W2, data = m.data2, weights = weights)

avg_comparisons(
  fit,
  variables = "W2",
  vcov = ~subclass,
  wts = "weights"
)
## 
##  Term Contrast Estimate Std. Error     z Pr(>|z|)  2.5 % 97.5 %
##    W2    1 - 0   0.0216     0.0694 0.311    0.755 -0.114  0.158
## 
## Columns: term, contrast, estimate, std.error, statistic, p.value, conf.low, conf.high
# Heterogeneous treatment effects via a causal forest (grf package).
library("marginaleffects")
set.seed(114514)
# Covariates X: columns 1:2 and 4:8 of the matched data (danmakusCount,
# timeDuration, watchCount, interactionCount, superchatCount,
# membershipCount, followers per the printed header above). Column 3
# (totalIncome) is the outcome; W1 is also left out of X — presumably
# because it is the other treatment indicator (TODO confirm intent).
X=m.data2[,c(1:2,4:8)]
Y=m.data2[,"totalIncome"]
W=m.data2[,"W2"]


# Fit the causal forest with full hyperparameter tuning, carrying the
# matching weights through as sample weights.
tau.forest <- causal_forest(X, Y, W,
  tune.parameters = "all",sample.weights = m.data2$weights
)
# Out-of-bag CATE (tau) prediction for each observation.
# NOTE(review): predict() is called again just below with variance
# estimates, so the forest is evaluated twice — redundant but harmless.
tau.hat <- predict(tau.forest)$predictions

# First few CATEs with variance and error diagnostics.
head(predict(tau.forest,estimate.variance = TRUE))
##   predictions variance.estimates debiased.error excess.error
## 1  0.01286693        0.007477905    0.192365005 8.210039e-05
## 2  0.07368682        0.005873541    0.481168133 7.265610e-05
## 3  0.01732016        0.001862600    0.001972586 5.948801e-05
## 4  0.05092729        0.010353448    0.327539426 1.260941e-04
## 5 -0.02185859        0.035880638    0.054570047 1.607373e-04
## 6  0.09250286        0.008440955    0.150049071 6.706317e-05
# Distribution of the estimated individual treatment effects.
summary(tau.hat)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.23021 -0.03187  0.02716  0.01667  0.07294  0.22168
# Doubly robust average effects for several target populations
# (ATE, ATT, ATC, and overlap-weighted). All are small and
# statistically indistinguishable from zero here.
average_treatment_effect(tau.forest, target.sample = "all")
##   estimate    std.err 
## 0.01634060 0.03535471
average_treatment_effect(tau.forest, target.sample = "treated")
##   estimate    std.err 
## 0.02101583 0.03467628
average_treatment_effect(tau.forest, target.sample = "control")
##   estimate    std.err 
## 0.01403770 0.03631965
average_treatment_effect(tau.forest, target.sample = "overlap")
##   estimate    std.err 
## 0.01789833 0.03487897

Linear model (baseline OLS, for comparison with the causal estimates)

# Benchmark: plain OLS of totalIncome on all covariates plus both treatment
# indicators, over the full (unmatched) sample. The model is fit ONCE and
# reused for both summary() and the diagnostic plots — the original refit
# the identical lm a second time just for plot().
fit_ols <- lm(totalIncome ~ ., data = df_causal)
summary(fit_ols)
## 
## Call:
## lm(formula = totalIncome ~ (.), data = df_causal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5903 -0.3885 -0.0908  0.3485  3.8241 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.0671977  0.0236867  -2.837  0.00459 ** 
## danmakusCount     0.1339164  0.0519730   2.577  0.01003 *  
## timeDuration      0.1167450  0.0383133   3.047  0.00234 ** 
## watchCount        0.0125995  0.0397595   0.317  0.75135    
## interactionCount -0.0639256  0.0755090  -0.847  0.39730    
## superchatCount    0.4202298  0.0189408  22.186  < 2e-16 ***
## membershipCount   0.6225290  0.0157685  39.479  < 2e-16 ***
## followers        -0.0009806  0.0313762  -0.031  0.97507    
## W1                0.1467601  0.0371184   3.954 7.91e-05 ***
## W2                0.0513868  0.0351373   1.462  0.14375    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8073 on 2430 degrees of freedom
## Multiple R-squared:  0.7809, Adjusted R-squared:  0.7801 
## F-statistic: 962.5 on 9 and 2430 DF,  p-value: < 2.2e-16
# Standard lm diagnostics: residuals vs fitted, normal Q-Q, scale-location,
# and residuals vs leverage.
plot(fit_ols)